import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from scipy.stats import zscore
# Load the vehicle silhouette data set into a pandas DataFrame
mydata = pd.read_csv('vehicle.csv')
# Check first 5 records
mydata.head()
# (rows, columns) of the data set
print(mydata.shape)
# Dtype of each variable and the count of non-null entries per column
mydata.info()
There are signs of missing values in the info above: the total number of entries is 846, but several features have fewer than 846 non-null data points.
# Count null values per column (NaNs recognised by pandas at read time)
mydata.isnull().sum()
As predicted, 14 features contain null values. These nulls need to be replaced or dropped.
# Five-number summary (plus count/mean/std) for every column; transposed for readability
mydata.describe(include="all").transpose()
Skewness can be noticed at following features: radius_ratio, pr.axis_aspect_ratio, max.length_aspect ratio, scaled_variance, scaled_variance.1, skewness_about and skewness_about1
# Rows containing duplicate data
duplicate_rows_df = mydata[mydata.duplicated()]
# Report the row count itself, not the (rows, cols) shape tuple
print("number of duplicate rows:", duplicate_rows_df.shape[0])
# Custom missing-value markers: re-read the file so these strings become NaN
missing_values = ["n/a", "na", "--", ".", "?", "??"]
mydata = pd.read_csv("vehicle.csv", na_values=missing_values)
# Total missing values for each feature
print(mydata.isnull().sum())
# Rename columns containing '.' (dots break attribute-style access) and 'class'
# (a Python keyword, so mydata.class would be a SyntaxError).
# Fix: 'max.length_aspect_ratio' previously mapped to 'max.length_ar', which
# still contained a dot and defeated the purpose of the rename.
mydata.rename(columns={'pr.axis_aspect_ratio': 'pr_axis_ar',
                       'max.length_aspect_ratio': 'max_length_ar',
                       'pr.axis_rectangularity': 'pr_axis_rectangularity',
                       'max.length_rectangularity': 'max_lr',
                       'scaled_variance.1': 'scaled_variance_1',
                       'scaled_radius_of_gyration.1': 'scaled_radius_of_gyration_1',
                       'skewness_about.1': 'skewness_about_1',
                       'skewness_about.2': 'skewness_about_2',
                       'class': 'CLASS'},
             inplace=True)
We have renamed 'class' to 'CLASS' because 'class' is a Python keyword, so the column must be renamed before it can be accessed as an attribute.
# Impute missing values feature by feature.
# Strategy: median for features whose boxplots/summary showed outliers or skew
# (outliers pull the mean, the median is robust), mean for the rest.
# Assignment is used instead of fillna(..., inplace=True): the chained
# column-view + inplace pattern is deprecated in pandas >= 2.x and does not
# reliably write back under copy-on-write.
mean_imputed_cols = [
    'circularity', 'distance_circularity', 'scatter_ratio', 'elongatedness',
    'pr_axis_rectangularity', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration_1', 'skewness_about_2',
]
median_imputed_cols = [
    'radius_ratio', 'pr_axis_ar', 'scaled_variance', 'scaled_variance_1',
    'skewness_about', 'skewness_about_1',
]
for col in mean_imputed_cols:
    mydata[col] = mydata[col].fillna(mydata[col].mean())
for col in median_imputed_cols:
    mydata[col] = mydata[col].fillna(mydata[col].median())
We used the median on a few features to fill null values because outliers affect the mean, whereas the median is robust to them.
# Re-check the missing-value count per attribute; should now be zero everywhere
pd.DataFrame( mydata.isnull().sum(), columns= ['Number of missing values'])
We have removed all null values by either mean or median as per outliers
# Boxplot for "radius_ratio" — points beyond the whiskers are potential outliers
mydata.boxplot(column="radius_ratio",return_type='axes',figsize=(3,3))
# Boxplot for "pr_axis_ar"
mydata.boxplot(column="pr_axis_ar",return_type='axes',figsize=(3,3))
The two features above are examples of the outliers that were predicted earlier.
# Outliers via the IQR statistical rule: flag points outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Restrict to numeric columns first: 'CLASS' is a string column, and in
# pandas >= 2.0 DataFrame.quantile() and ordered comparisons raise a
# TypeError on object dtypes (older pandas silently dropped them).
numeric_data = mydata.select_dtypes(include=np.number)
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
np.where((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))) # positions of outlying data points
mydata_out = mydata[~((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))).any(axis=1)] # rows with no outlier in any feature
mydata_out.shape
# 23 rows fall outside the whiskers; dropping them could surface a new set of
# outliers, so instead we cap (winsorize) the values at the whiskers on a copy.
mydata_copy = mydata.copy()
# Clip every numeric feature to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in one vectorised
# pass. This replaces the two positional iloc loops, which recomputed np.where
# twice each and silently relied on the numeric columns of Q1/Q3 lining up
# positionally with the columns of the full DataFrame.
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR
mydata_copy[lower_whisker.index] = mydata_copy[lower_whisker.index].clip(
    lower=lower_whisker, upper=upper_whisker, axis=1)
mydata_copy.shape, mydata.shape
Outliers have been dealt with using the IQR rule.
np.where((mydata_copy < (Q1 - 1.5 * IQR)) | (mydata_copy > (Q3 + 1.5 * IQR))) # rows with outliers after using IQR technique
We can see a slight reduction in outliers. Some outliers may remain, but we can ignore them; we could reduce them further by using a smaller multiplier (i.e. less than 1.5 times the IQR).
# Countplot of the raw string CLASS labels to inspect class balance
sns.countplot(y=mydata_copy.CLASS)
plt.show()
# Re-encode the string target CLASS as integers (0=bus, 1=car, 2=van per the note below)
labelencoder = LabelEncoder()
mydata_copy['CLASS'] = labelencoder.fit_transform(mydata_copy.CLASS) # returns label encoded variable(s)
mydata_copy.CLASS = mydata_copy.CLASS.astype('int') # ensure a plain int dtype for the encoded target
# Countplot again, now over the encoded labels
sns.countplot(y=mydata_copy.CLASS)
plt.show()
Car has the most data points
0 -> Bus 1 -> Car 2 -> Van
sns.pairplot(mydata_copy)
Here we can see that several features (about 10) have positive or negative linear relationships with each other, so we can reduce the dimensions using PCA later. These relationships are also visible in the correlation matrix below.
We can see the outliers on the diagonal plots as well, and a few features whose distributions are clearly not normal.
# Correlation matrix and heatmap for the same
plt.figure(figsize=(20,10))
c= mydata_copy.corr()
sns.heatmap(c,annot=True)
c
We can notice that features(assumed independent variables) are not having any relationships to predict a CLASS(dependent/target variable)
Highest +ve correlation wrt to CLASS = elongatedness (0.33) Highest -ve correlation wrt to CLASS = scaled_variance (-0.30)
We can use all independent features for our SVM model
# Splitting data into 70:30 train/test
X = mydata_copy.drop('CLASS',axis=1) # all independent variables
Y = mydata_copy['CLASS'] # separating the dependent variable (values = 0,1,2)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# random_state=1 fixes the split for reproducibility
x_train.head()
# Standardise all features to zero mean / unit variance before SVM
sc_x = StandardScaler()
# NOTE(review): sc_X is fitted on the FULL data set and reused for the KFold
# runs below; fitting the scaler before cross-validation leaks test-fold
# statistics — a Pipeline(StandardScaler(), SVC()) inside cross_val_score
# would avoid this. Left as-is to preserve the reported results.
sc_X = sc_x.fit_transform(X) # we need this for KFold
sc_x_train = sc_x.fit_transform(x_train) # re-fit on the training split only
sc_x_test = sc_x.transform(x_test) # transform test data with training statistics (no refit)
svm_model = svm.SVC(kernel='linear' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'; gamma is ignored by the linear kernel
svm_model.fit(sc_x_train , y_train)
y_pred = svm_model.predict(sc_x_test)
confusion_matrix(y_test , y_pred) # rows = true class, columns = predicted class
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=3)) # digits parameter too handle multiclass
After trying out various combinations of kernel and gamma values, I have found that kernel = 'linear' and gamma = 'auto' gives us the highest accuracy. Also other parameters such as precision,recall and f1-score is highest for the above combination.
Accuracy = 96.06%
# 50-fold cross-validation of a polynomial-kernel SVM on the scaled data
num_folds = 50
seed = 7
# shuffle=True is required when random_state is set: sklearn >= 0.24 raises a
# ValueError for KFold(random_state=...) with the default shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = svm.SVC(kernel='poly' , gamma='auto')
results = cross_val_score(model, sc_X, Y, cv=kfold)
# mean and std of the per-fold accuracies
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Repeat with 30 folds to compare fold-count sensitivity
num_folds = 30
seed = 7
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = svm.SVC(kernel='poly' , gamma='auto')
results = cross_val_score(model, sc_X, Y, cv=kfold)
# mean and std of the per-fold accuracies
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Repeat with 40 folds and also report macro precision/recall/F1
num_folds = 40
seed = 7
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = svm.SVC(kernel='poly' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'
results = cross_val_score(model, sc_X, Y, cv=kfold)
print(results)
#get the mean accuracy of each fold
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# macro-averaged scores weight the three classes equally despite the imbalance
results_prec = cross_val_score(model, sc_X, Y, cv=kfold, scoring='precision_macro')
results_rec = cross_val_score(model, sc_X, Y, cv=kfold, scoring='recall_macro')
results_f1 = cross_val_score(model, sc_X, Y, cv=kfold, scoring='f1_macro')
#get the mean precision of each fold
print("Precison: %.3f%% (%.3f%%)" % (results_prec.mean()*100.0, results_prec.std()*100.0))
#get the mean recall of each fold
print("Recall: %.3f%% (%.3f%%)" % (results_rec.mean()*100.0, results_rec.std()*100.0))
#get the mean f1 score of each fold
print("F1 Score: %.3f%% (%.3f%%)" % (results_f1.mean()*100.0, results_f1.std()*100.0))
Accuracy is highest when number of folds is set to 40.
Accuracy score can vary between 72.95% and 93.37%
We can try with stratified kfold as well cause it usually performs better than normal kfold
# Stratified 50-fold CV: each fold preserves the class proportions
k=50
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
stratified_model=StratifiedKFold(n_splits=k, shuffle=True, random_state=7)
model = svm.SVC(kernel='poly' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'
results=cross_val_score(model ,sc_X,Y,cv=stratified_model)
print('Average Accuracy:',results.mean())
print('STD',results.std())
# BUG FIX: the three scores below previously reused the plain KFold splitter
# ('kfold') from the earlier cell, so the reported precision/recall/F1 were
# not stratified at all; they now use the stratified splitter.
results_prec = cross_val_score(model, sc_X, Y, cv=stratified_model, scoring='precision_macro')
results_rec = cross_val_score(model, sc_X, Y, cv=stratified_model, scoring='recall_macro')
results_f1 = cross_val_score(model, sc_X, Y, cv=stratified_model, scoring='f1_macro')
#get the mean precision of each fold
print("Precison: %.3f%% (%.3f%%)" % (results_prec.mean()*100.0, results_prec.std()*100.0))
#get the mean recall of each fold
print("Recall: %.3f%% (%.3f%%)" % (results_rec.mean()*100.0, results_rec.std()*100.0))
#get the mean f1 score of each fold
print("F1 Score: %.3f%% (%.3f%%)" % (results_f1.mean()*100.0, results_f1.std()*100.0))
StratifiedKFold has shown higher accuracy than normal kfold.
Have tried with different values for kernels of svm and k to find out -> kernel=poly and k=50 gives the best accuracy
# The data is already standardised; reuse the scaled matrix for the covariance step
print(sc_X)
covMatrix = np.cov(sc_X,rowvar=False) # rowvar=False: columns are the variables, rows are observations
print(covMatrix)
plt.figure(figsize=(20,10))
sns.heatmap(covMatrix,annot=True)
Covariance matrix is closely related to correlation matrix, which helps us identifying the relationships between the features.
Here from heatmap, I have noticed that this is similar to the heatmap as before
# Fit PCA keeping every component so the full variance spectrum can be inspected.
# Generalized: derive the component count from the data instead of hard-coding 18.
n_features = sc_X.shape[1]  # 18 independent features (target excluded)
pca = PCA(n_components=n_features)
pca.fit(sc_X)
print(pca.explained_variance_) # eigenvalues, sorted descending: variance of each component
print(pca.components_) # eigenvectors, one row per component; hard to visualize beyond 3 dims
print(pca.explained_variance_ratio_) # fraction of the total variance per component
# Scree plot: variance explained by each individual component
component_ids = list(range(1, n_features + 1))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance: pick the point where ~95% of the variance is captured
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
we are asked to capture components with 95% variance -> we can notice that at 8th eigen value, variance is 95%
data_reduced= PCA(n_components=8) # 8 components capture ~95% of the variance (see cumulative plot above)
data_reduced.fit(sc_X)
print(data_reduced.components_)
print(data_reduced.explained_variance_ratio_)
Xdata_reduced = data_reduced.transform(sc_X) # project the scaled data onto the 8 components
Xdata_reduced
# Pairplot of the components; principal components are mutually uncorrelated
sns.pairplot(pd.DataFrame(Xdata_reduced))
We can see that relationships of components have changed to independent from dependent (dependence in features was seen in earlier pairplot)
Also distributions of data points are closer to binomial.
# Splitting the PCA-reduced data into 70:30 train/test
X = Xdata_reduced # all 8 PCA components as independent variables
Y = mydata_copy['CLASS'] # dependent variable (values = 0,1,2), same as defined earlier
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# random_state=1 fixes the split for reproducibility
Y.shape
# RBF-kernel SVM on the PCA-reduced features
svm_model = svm.SVC(kernel='rbf' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'
svm_model.fit(x_train , y_train)
y_pred = svm_model.predict(x_test)
confusion_matrix(y_test , y_pred) # rows = true class, columns = predicted class
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=3))
Having tried various kernel values, I found that RBF performed the best.
There is a slight drop in accuracy and is expected because there is slight loss of information after PCA of approx=5%
# 50-fold CV of the RBF SVM on the PCA-reduced features
num_folds = 50
seed = 7
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = svm.SVC(kernel='rbf' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'
results = cross_val_score(model, X, Y, cv=kfold)
# mean and std of the per-fold accuracies
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
Cross validation performed with 94.57% Accuracy with 4.83% STD
# Repeat with 60 folds for comparison
num_folds = 60
seed = 7
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = svm.SVC(kernel='rbf' , gamma='auto')
results = cross_val_score(model, X, Y, cv=kfold)
# mean and std of the per-fold accuracies
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
Cross validation performed with 94.45% Accuracy with 3.91% STD
Have tried for various kernels,num_folds and have found rbf with folds=50 again to be the best
# Stratified 60-fold CV of the RBF SVM on the PCA-reduced features,
# reporting accuracy plus macro precision/recall/F1
k=60
# shuffle=True is required when random_state is set (sklearn >= 0.24 raises otherwise)
stratified_model=StratifiedKFold(n_splits=k, shuffle=True, random_state=7)
model = svm.SVC(kernel='rbf' , gamma='auto') #kernel = 'linear'/'poly'/'rbf'/'sigmoid'
results_acc=cross_val_score(model ,X,Y,cv=stratified_model)
results_prec=cross_val_score(model ,X,Y,cv=stratified_model, scoring='precision_macro')
results_recc=cross_val_score(model ,X,Y,cv=stratified_model, scoring='recall_macro')
results_f1=cross_val_score(model ,X,Y,cv=stratified_model, scoring='f1_macro')
print('Average Accuracy:',results_acc.mean())
print('STD',results_acc.std())
print('Average precision:',results_prec.mean())
print('STD',results_prec.std())
print('Average recall:',results_recc.mean())
print('STD',results_recc.std())
print('Average f1 score:',results_f1.mean())
print('STD',results_f1.std())
Stratified KFold performs at 94.60% accuracy, slightly better than normal KFold.
SVM model with kernel = 'linear' and gamma = 'auto'
Accuracy = 96.06% , precison = 95.7% , recall = 95.7% , F1 Score = 95.7%
KFold with folds = 40 with SVM Model (Kernel = 'Poly' and Gamma = 'Auto')
Accuracy = 83.16% and STD = 10.20%, Precision = 88.02% and STD = 09.53%, Recall = 78.47% and STD = 11.59%, F1 score = 80.10% and STD = 11.45%
Stratified KFold with folds = 50 with SVM Model (Kernel = 'Poly' and Gamma = 'Auto')
Accuracy = 83.47% and STD = 09.63%, Precision = 88.02% and STD = 09.53%, Recall = 78.47% and STD = 11.59%, F1 score = 80.10% and STD = 11.45%
SVM model with kernel = 'rbf' and gamma = 'auto'
Accuracy = 92.51% , precison = 91.90% , recall = 92.00% , F1 Score = 91.9%
KFold with folds = 40 with SVM Model (Kernel = 'rbf' and Gamma = 'Auto')
Accuracy = 94.57% and STD = 4.83%
Stratified KFold with folds = 60 with SVM Model (Kernel = 'Poly' and Gamma = 'Auto')
Accuracy = 83.47% and STD = 09.63%, Precision = 88.02% and STD = 09.53%, Recall = 78.47% and STD = 11.59%, F1 score = 80.10% and STD = 11.45%
We can see that SVM on the raw data provides the highest accuracy, but this would be misleading because of overfitting, and because the assumptions that the variables are independent and normally distributed are not met in the raw data as per our analysis.
We can safely move the SVM model after PCA and stratified KFold after PCA to production as we can see they provide better overall performance in terms of accuracy, precision, recall and F1 scores. We will not consider normal kfold because it has draw backs due to which it can give false high accuracies, stratified is usually prefered over normal KFold.
Stratified model more reliable, rearranging is done as to ensure each fold is a good representative of the whole cause we have multiclass target variable which is imbalanced.